Using RNA rlog normalized counts, and CAGE rlog normalized:
1. In general RNA and CAGE agrees with each other. bam_RNA seems to lose resolution in lowly expressed region, where CAGE has more spread. CAGE has more sequencing depth, maybe better resolution for lowly expressed genes.
2. do 3D plotting of bam_RNA, HS72_RNA and bam_CAGE to decide where to draw cutoff for off genes. In the end decided that whoever makes it to consensus cluster by tpm filter is ON.
suppressMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'stringr' was built under R version 3.3.2
## Warning: package 'forcats' was built under R version 3.3.2
suppressMessages(library(magrittr))
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
options(tibble.width = Inf)
options(scipen=999)
source("~/Dropbox/coding/R/my_package/genomicsDL/R/get_TSS.r")
plot_range=c(-3, 18)
off_cut_CAGE=4.8
off_cut_RNA=7.5
BDGP6_coding_200bp_intersect_consensusCluster <-read.delim("0BDGP6_coding_TSS200bp_intersect_consensusCluster.txt", header=FALSE, stringsAsFactors=FALSE)
rlog <- read.delim("DE_results_rlog_RNA_all.txt", header=T, stringsAsFactors=FALSE) %>% mutate(bam_RNA=(SHS.1.1.HS0hr+SHS.2.1.HS0hr)/2, HS48_RNA=(SHS.1.2.HS48hr+SHS.2.2.HS48hr)/2, HS72_RNA=(SHS.1.3.HS72hr+SHS.2.3.HS72hr)/2, Sa_RNA=(SHS.1.8.Sa+SHS.2.8.Sa)/2, Aly_RNA=(SHS.1.9.Aly+SHS.2.9.Aly)/2 )
CAGE_rlog_BH <- read.delim("DE_normalized_counts_rlog_CAGE_BH_up500_5utr_CDS.txt", stringsAsFactors=FALSE) %>% mutate(bam_CAGE = (B1 + B2) / 2, HS72_CAGE = (H1 + H2) / 2 ) %>% select(X,bam_CAGE, HS72_CAGE)
logt_c = inner_join(rlog, BDGP6_coding_200bp_intersect_consensusCluster, by=c("X"="V4")) %>% inner_join(CAGE_rlog_BH, by=c("V10"="X"))
names(logt_c)
## [1] "X" "SHS.1.1.HS0hr" "SHS.1.2.HS48hr" "SHS.1.3.HS72hr"
## [5] "SHS.1.8.Sa" "SHS.1.9.Aly" "SHS.2.1.HS0hr" "SHS.2.2.HS48hr"
## [9] "SHS.2.3.HS72hr" "SHS.2.8.Sa" "SHS.2.9.Aly" "bam_RNA"
## [13] "HS48_RNA" "HS72_RNA" "Sa_RNA" "Aly_RNA"
## [17] "V1" "V2" "V3" "V5"
## [21] "V6" "V7" "V8" "V9"
## [25] "V10" "V11" "V12" "V13"
## [29] "bam_CAGE" "HS72_CAGE"
ggplot(logt_c, aes(bam_RNA, HS72_RNA)) +
xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
ylab("log2(HS72_RNA)") +
coord_fixed(ratio=1) +geom_vline(xintercept = off_cut_RNA, color='red') +
geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)
ggplot(logt_c, aes(bam_CAGE, HS72_CAGE)) +
xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
ylab("log2(HS72_CAGE)") +
coord_fixed(ratio=1) +
#geom_vline(xintercept = off_cut_CAGE, color='red') +
#geom_hline(yintercept = off_cut_CAGE, color='red') +
geom_abline(intercept = 2, slope=1, color='yellow') +
geom_abline(intercept = 4, slope=1, color='black') +
geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)
## Warning: Removed 2 rows containing non-finite values (stat_bin2d).
ggplot(logt_c, aes(bam_RNA, bam_CAGE)) +
geom_point(alpha = 3/10, size = 0.3, color="grey42") +
xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
ylab("log2(bam_CAGE)") +
coord_fixed(ratio=1) +
ggtitle("all transcripts") + geom_abline(intercept = 0, slope = 1, color='red') +
#geom_hline(yintercept = off_cut_CAGE, color='red') +
theme(plot.title = element_text(hjust = 0.5)) + theme_light()
## Warning: Removed 2 rows containing missing values (geom_point).
ggplot(logt_c, aes(bam_RNA, bam_CAGE)) +
xlab("log2(bam_RNA)") + xlim(plot_range) + ylim(plot_range) +
ylab("log2(bam_CAGE)") +
coord_fixed(ratio=1) +geom_abline(intercept = 0, slope = 1, color='red') +
geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)
## Warning: Removed 2 rows containing non-finite values (stat_bin2d).
ggplot(logt_c, aes(HS72_RNA, HS72_CAGE)) +
geom_point(alpha = 3/10, size = 0.3, color="grey42") +
xlab("log2(HS72_RNA)") + xlim(plot_range) + ylim(plot_range) +
ylab("log2(HS72_CAGE)") +
coord_fixed(ratio=1) +
ggtitle("all transcripts") +geom_abline(intercept = 0, slope = 1, color='red') +
#geom_hline(yintercept = off_cut_CAGE, color='red') +
theme(plot.title = element_text(hjust = 0.5)) + theme_light()
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(logt_c, aes(HS72_RNA, HS72_CAGE)) +
xlab("log2(HS72_RNA)") + xlim(plot_range) + ylim(plot_range) +
ylab("log2(HS72_CAGE)") +
coord_fixed(ratio=1) +geom_abline(intercept = 0, slope = 1, color='red') +
geom_bin2d(bins = 150) + scale_fill_distiller(palette = "YlGnBu", direction = -1)
## Warning: Removed 1 rows containing non-finite values (stat_bin2d).
p=plot_ly(slice(logt_c,1:5000), x=~bam_RNA, y=~HS72_RNA, z=~bam_CAGE, opacity=0.5, marker=list(size=2), mode="markers", type="scatter3d") #%>% add_trace(x=c(-4,4), y=c(-4,4), z=c(-4,4), mode="lines")
p
p=plot_ly(slice(logt_c,1:5000), x=~bam_RNA, y=~HS72_RNA, z=~HS72_CAGE, opacity=0.5, marker=list(size=2), mode="markers", type="scatter3d") #%>% add_trace(x=c(-4,4), y=c(-4,4), z=c(-4,4), mode="lines")
p
bam_CAGE_off=filter(logt_c, bam_CAGE <= off_cut_CAGE) %>% select(X) %>% unique()
nrow(bam_CAGE_off)
## [1] 2622
nrow(unique(filter(logt_c, bam_CAGE <= off_cut_CAGE & bam_RNA <= off_cut_RNA)))
## [1] 2008
these are the alternative TSS usage ones!
nrow(unique(filter(logt_c, bam_CAGE < off_cut_CAGE & bam_RNA > 10)))
## [1] 727
nrow(unique(select(filter(logt_c, bam_CAGE < off_cut_CAGE & bam_RNA > 10),X)))
## [1] 529
x=filter(logt_c, bam_CAGE < 0 & bam_RNA > 10 & bam_RNA < 12 & HS72_RNA >10 & HS72_RNA < 12)
x_bed=select(x,V7:V12)
#write.table(x_bed,"bam_CAGE_off_RNA_on.bed", sep="\t", quote=F, row.names = F, col.names = F)
very few genes. maybe just for some reason didn’t pull by cap in CAGE?
nrow(unique(filter(logt_c, HS72_CAGE < 2 & HS72_RNA > off_cut_RNA)))
## [1] 13
nrow(unique(select(filter(logt_c, HS72_CAGE < 2 & HS72_RNA > off_cut_RNA),X)))
## [1] 12
y=filter(logt_c, HS72_CAGE < 2 & HS72_RNA > off_cut_RNA)
y_bed=select(y,V7:V12)
#write.table(y_bed,"HS72_CAGE_off_RNA_on.bed", sep="\t", quote=F, row.names = F, col.names = F)
I thought that they are below detection level for RNAseq, BUT they are the multimappers that I failed to filter out of CAGE Q255.chr.bam!!!!! Fixed in this version.
nrow(unique(select(filter(logt_c, bam_RNA < 1 & bam_CAGE > off_cut_CAGE),X)))
## [1] 3
nrow(unique(select(filter(logt_c, HS72_RNA < 1 & HS72_CAGE > off_cut_CAGE),X)))
## [1] 1
zb=filter(logt_c, bam_RNA < 1 & bam_CAGE > off_cut_CAGE) %>% select(V7:V12)
zh=filter(logt_c, HS72_RNA < 1 & HS72_CAGE > off_cut_CAGE)%>% select(V7:V12)
nrow(unique(inner_join(select(filter(logt_c, bam_RNA < 1 & bam_CAGE > off_cut_CAGE),X), select(filter(logt_c, HS72_RNA < 1 & HS72_CAGE > off_cut_CAGE),X))))
## Joining, by = "X"
## [1] 0
#write.table(zb, "bam_RNA_off_CAGE_on.bed", sep="\t", quote=F, row.names = F, col.names = F)
#write.table(zh, "HS72_RNA_off_CAGE_on.bed", sep="\t", quote=F, row.names = F, col.names = F)
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.7.1 magrittr_1.5 forcats_0.2.0 stringr_1.2.0
## [5] dplyr_0.8.0.1 purrr_0.3.1 readr_1.1.1 tidyr_0.7.2
## [9] tibble_2.0.1 ggplot2_2.2.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 reshape2_1.4.2 haven_1.1.0
## [4] lattice_0.20-35 colorspace_1.3-2 htmltools_0.3.6
## [7] viridisLite_0.3.0 yaml_2.1.14 rlang_0.3.1
## [10] pillar_1.3.1 foreign_0.8-69 glue_1.3.0
## [13] RColorBrewer_1.1-2 modelr_0.1.1 readxl_1.0.0
## [16] plyr_1.8.4 munsell_0.4.3 gtable_0.2.0
## [19] cellranger_1.1.0 rvest_0.3.2 htmlwidgets_0.9
## [22] psych_1.7.8 evaluate_0.10.1 labeling_0.3
## [25] knitr_1.17 httpuv_1.3.5 crosstalk_1.0.0
## [28] parallel_3.3.1 broom_0.4.2 Rcpp_1.0.0
## [31] xtable_1.8-2 scales_0.5.0 backports_1.1.1
## [34] jsonlite_1.5 mime_0.5 mnormt_1.5-5
## [37] hms_0.3 digest_0.6.12 stringi_1.1.5
## [40] shiny_1.0.5 grid_3.3.1 rprojroot_1.2
## [43] cli_1.0.1 tools_3.3.1 lazyeval_0.2.1
## [46] crayon_1.3.4 pkgconfig_2.0.2 data.table_1.10.4-3
## [49] xml2_1.1.1 lubridate_1.7.1 assertthat_0.2.0
## [52] rmarkdown_1.6 httr_1.3.1 rstudioapi_0.7
## [55] R6_2.2.2 nlme_3.1-131